In [1]:
import warnings
warnings.filterwarnings("ignore")

import os

import pandas as pd
import numpy as np
import plotly.express as px

import missingno as msno
In [2]:
#################################################################
##### Setting pandas display options for pretty print
#################################################################

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
pd.set_option('display.max_colwidth', 50)

Constant parameters¶

In [3]:
# COVID data URL picked from https://github.com/owid/covid-19-data/tree/master/public/data
COVID_DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
COVID_DATA_FILE_NAME = "owid-covid-data.csv"

COVID_DATA_DICTIONARY_URL = "https://covid.ourworldindata.org/data/owid-covid-codebook.csv"
COVID_DATA_DICTIONARY_FILE_NAME = "owid-covid-codebook.csv"

RESULTS_PATH = "EDS_Covid19_Exp_results"

DATA_FOLDER = "data"

Generated Parameters¶

In [4]:
covid_data_full_file_path = os.path.join(RESULTS_PATH, 
                                         DATA_FOLDER, 
                                         COVID_DATA_FILE_NAME)

covid_data_dictionary_full_file_path = os.path.join(RESULTS_PATH, 
                                                    DATA_FOLDER, 
                                                    COVID_DATA_DICTIONARY_FILE_NAME)
In [5]:
# Creating experiment folders as required
if not os.path.isdir(RESULTS_PATH):
    os.makedirs(RESULTS_PATH)
    
if not os.path.isdir(os.path.join(RESULTS_PATH, 
                                  DATA_FOLDER)):
    os.makedirs(os.path.join(RESULTS_PATH, 
                             DATA_FOLDER))

Fetching Data¶

In [6]:
#################################################################
##### Covid data 
#################################################################

if os.path.isfile(covid_data_full_file_path):
    print("Local data file available:", covid_data_full_file_path)
    df_full_covid_data = pd.read_csv(covid_data_full_file_path, 
                                     sep=",", 
                                     header=0)
    print("Data load complete.")
else:
    print("No local data file found. Downloading from URL:", COVID_DATA_URL)
    df_full_covid_data = pd.read_csv(COVID_DATA_URL, 
                                     sep=",", 
                                     header=0)
    df_full_covid_data.to_csv(covid_data_full_file_path, 
                              sep=",", 
                              header=True, 
                              index=False)
    print("Data downloaded and saved to:", covid_data_full_file_path)

#################################################################
##### Covid data dictionary
#################################################################

if os.path.isfile(covid_data_dictionary_full_file_path):
    print("\nLocal dictionary file available:", covid_data_dictionary_full_file_path)
    df_covid_data_dict = pd.read_csv(covid_data_dictionary_full_file_path, 
                                     sep=",", 
                                     header=0)
    print("Dictionary load complete.")
else:
    print("\nNo local dictionary file found. Downloading from URL:", COVID_DATA_DICTIONARY_URL)
    df_covid_data_dict = pd.read_csv(COVID_DATA_DICTIONARY_URL, 
                                     sep=",", 
                                     header=0)
    df_covid_data_dict.to_csv(covid_data_dictionary_full_file_path, 
                              sep=",", 
                              header=True, 
                              index=False)
    print("Dictionary downloaded and saved to:", covid_data_dictionary_full_file_path)
Local data file available: EDS_Covid19_Exp_results\data\owid-covid-data.csv
Data load complete.

Local dictionary file available: EDS_Covid19_Exp_results\data\owid-covid-codebook.csv
Dictionary load complete.
In [7]:
#################################################################
##### codebook - contains details of every column specified by data provider
#################################################################

df_covid_data_dict[['column', 'description']]
Out[7]:
column description
0 iso_code ISO 3166-1 alpha-3 – three-letter country codes
1 continent Continent of the geographical location
2 location Geographical location
3 date Date of observation
4 total_cases Total confirmed cases of COVID-19. Counts can ...
5 new_cases New confirmed cases of COVID-19. Counts can in...
6 new_cases_smoothed New confirmed cases of COVID-19 (7-day smoothe...
7 total_deaths Total deaths attributed to COVID-19. Counts ca...
8 new_deaths New deaths attributed to COVID-19. Counts can ...
9 new_deaths_smoothed New deaths attributed to COVID-19 (7-day smoot...
10 total_cases_per_million Total confirmed cases of COVID-19 per 1,000,00...
11 new_cases_per_million New confirmed cases of COVID-19 per 1,000,000 ...
12 new_cases_smoothed_per_million New confirmed cases of COVID-19 (7-day smoothe...
13 total_deaths_per_million Total deaths attributed to COVID-19 per 1,000,...
14 new_deaths_per_million New deaths attributed to COVID-19 per 1,000,00...
15 new_deaths_smoothed_per_million New deaths attributed to COVID-19 (7-day smoot...
16 reproduction_rate Real-time estimate of the effective reproducti...
17 icu_patients Number of COVID-19 patients in intensive care ...
18 icu_patients_per_million Number of COVID-19 patients in intensive care ...
19 hosp_patients Number of COVID-19 patients in hospital on a g...
20 hosp_patients_per_million Number of COVID-19 patients in hospital on a g...
21 weekly_icu_admissions Number of COVID-19 patients newly admitted to ...
22 weekly_icu_admissions_per_million Number of COVID-19 patients newly admitted to ...
23 weekly_hosp_admissions Number of COVID-19 patients newly admitted to ...
24 weekly_hosp_admissions_per_million Number of COVID-19 patients newly admitted to ...
25 total_tests Total tests for COVID-19
26 new_tests New tests for COVID-19 (only calculated for co...
27 total_tests_per_thousand Total tests for COVID-19 per 1,000 people
28 new_tests_per_thousand New tests for COVID-19 per 1,000 people
29 new_tests_smoothed New tests for COVID-19 (7-day smoothed). For c...
30 new_tests_smoothed_per_thousand New tests for COVID-19 (7-day smoothed) per 1,...
31 positive_rate The share of COVID-19 tests that are positive,...
32 tests_per_case Tests conducted per new confirmed case of COVI...
33 tests_units Units used by the location to report its testi...
34 total_vaccinations Total number of COVID-19 vaccination doses adm...
35 people_vaccinated Total number of people who received at least o...
36 people_fully_vaccinated Total number of people who received all doses ...
37 total_boosters Total number of COVID-19 vaccination booster d...
38 new_vaccinations New COVID-19 vaccination doses administered (o...
39 new_vaccinations_smoothed New COVID-19 vaccination doses administered (7...
40 total_vaccinations_per_hundred Total number of COVID-19 vaccination doses adm...
41 people_vaccinated_per_hundred Total number of people who received at least o...
42 people_fully_vaccinated_per_hundred Total number of people who received all doses ...
43 total_boosters_per_hundred Total number of COVID-19 vaccination booster d...
44 new_vaccinations_smoothed_per_million New COVID-19 vaccination doses administered (7...
45 new_people_vaccinated_smoothed Daily number of people receiving their first v...
46 new_people_vaccinated_smoothed_per_hundred Daily number of people receiving their first v...
47 stringency_index Government Response Stringency Index: composit...
48 population Population (latest available values). See http...
49 population_density Number of people divided by land area, measure...
50 median_age Median age of the population, UN projection fo...
51 aged_65_older Share of the population that is 65 years and o...
52 aged_70_older Share of the population that is 70 years and o...
53 gdp_per_capita Gross domestic product at purchasing power par...
54 extreme_poverty Share of the population living in extreme pove...
55 cardiovasc_death_rate Death rate from cardiovascular disease in 2017...
56 diabetes_prevalence Diabetes prevalence (% of population aged 20 t...
57 female_smokers Share of women who smoke, most recent year ava...
58 male_smokers Share of men who smoke, most recent year avail...
59 handwashing_facilities Share of the population with basic handwashing...
60 hospital_beds_per_thousand Hospital beds per 1,000 people, most recent ye...
61 life_expectancy Life expectancy at birth in 2019
62 human_development_index A composite index measuring average achievemen...
63 excess_mortality Percentage difference between the reported num...
64 excess_mortality_cumulative Percentage difference between the cumulative n...
65 excess_mortality_cumulative_absolute Cumulative difference between the reported num...
66 excess_mortality_cumulative_per_million Cumulative difference between the reported num...

Missing data analysis¶

In [8]:
df_full_covid_data.shape
Out[8]:
(197003, 67)
In [9]:
# Required fields for Delivery 1 and 2
required_fields = ['location', 'date',
                   'total_cases', 'population', 
                   'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']

Filter out continental and world data, keep only country specific data¶

In [10]:
print('Non-country locations:\n', df_full_covid_data[df_full_covid_data['continent'].isnull()]['location'].unique())
Non-country locations:
 ['Africa' 'Asia' 'Europe' 'European Union' 'High income' 'International'
 'Low income' 'Lower middle income' 'North America' 'Oceania'
 'South America' 'Upper middle income' 'World']
In [11]:
df_full_covid_data = df_full_covid_data[df_full_covid_data['continent'].notnull()].reset_index(drop=True)

Bar chart representing relative completion of the data columns:¶

In [12]:
msno.bar(df_full_covid_data)
Out[12]:
<AxesSubplot:>
In [13]:
msno.bar(df_full_covid_data[required_fields])
Out[13]:
<AxesSubplot:>

Nullity matrix, similar to the bar chart, but spatially represents the missing information in the dataframe:¶

In [14]:
msno.matrix(df_full_covid_data)
Out[14]:
<AxesSubplot:>
In [15]:
msno.matrix(df_full_covid_data[required_fields])
Out[15]:
<AxesSubplot:>

Dendogram to visualize the correlation of completion between variables:¶

In [16]:
msno.dendrogram(df_full_covid_data)
Out[16]:
<AxesSubplot:>
In [17]:
msno.dendrogram(df_full_covid_data[required_fields])
Out[17]:
<AxesSubplot:>

Analyzing missing data of vaccinations by country¶

In [18]:
df_missing_vaccination_by_country = df_full_covid_data[[required_fields[i] for i in [0, -3, -2, -1]]].drop('location', 
                                                                                                           axis=1).isna().groupby(df_full_covid_data.location).sum().reset_index()

df_missing_vaccination_by_country['location_records'] = df_full_covid_data['location'].value_counts().reset_index().sort_values(by='index')['location'].reset_index(drop=True)

df_missing_vaccination_by_country['people_vaccinated'] = df_missing_vaccination_by_country['people_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['people_fully_vaccinated'] = df_missing_vaccination_by_country['people_fully_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['total_boosters'] = df_missing_vaccination_by_country['total_boosters']/df_missing_vaccination_by_country['location_records']

df_missing_vaccination_by_country['mean_total_missing'] = (df_missing_vaccination_by_country['people_vaccinated'] + df_missing_vaccination_by_country['people_fully_vaccinated'] + df_missing_vaccination_by_country['total_boosters']) / 3
In [19]:
fig = px.bar(df_missing_vaccination_by_country, x='location', y='mean_total_missing')
fig.show()
In [20]:
df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True).head(10)
Out[20]:
location people_vaccinated people_fully_vaccinated total_boosters location_records mean_total_missing
111 Latvia 0.349 0.365 0.372 852 0.362
198 Switzerland 0.366 0.366 0.377 853 0.370
218 United States 0.373 0.373 0.400 887 0.382
63 Estonia 0.382 0.382 0.392 875 0.385
117 Lithuania 0.364 0.385 0.426 849 0.392
153 Norway 0.342 0.374 0.471 853 0.396
70 France 0.385 0.386 0.419 885 0.397
54 Denmark 0.365 0.381 0.445 876 0.397
7 Argentina 0.400 0.400 0.400 908 0.400
35 Canada 0.418 0.388 0.438 886 0.415

Missing Data Analysis Insights:¶

  • Total 197003 records in data
  • Missing values in many columns. Required fields for Delivery 1 and 2 have missing values in:
    • total_cases, population, people_vaccinated, people_fully_vaccinated, total_boosters
  • Missing values are significant in vaccination data
    • Missing values often significant for many countries/region
    • Using the mean missing information from the three vaccination measures (people_vaccinated people_fully_vaccinated total_boosters), top 3 countries with least missing information selected
      • Latvia, Switzerland, United States selected in current report
  • From nullity matrix and dendogram, we can see that missing values of columns can occur in pairs (correlated)
In [21]:
country_list = list(df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True)['location'].head(3))
In [22]:
df_3country_covid_data = df_full_covid_data[df_full_covid_data['location'].isin(country_list)][required_fields].reset_index(drop=True)

Missing Value Imputation¶

  • Interpolating using pandas default function, in both direction to impute all missing values.
    • Executed separately for each country
In [23]:
impute_column_list = ['total_cases', 'population', 
                      'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']
In [24]:
msno.matrix(df_3country_covid_data)
Out[24]:
<AxesSubplot:>
In [25]:
df_3country_covid_data_imputed = None

for current_country in list(df_3country_covid_data['location'].unique()):
    
    temp_df_current_country_data = df_3country_covid_data[df_3country_covid_data['location'] == current_country]
    temp_df_current_country_data = temp_df_current_country_data.sort_values(by='date', ascending=True)
    
    for col in impute_column_list:
        temp_df_current_country_data[col] = temp_df_current_country_data[col].interpolate(limit_direction="both")
    
    if df_3country_covid_data_imputed is None:
        df_3country_covid_data_imputed = temp_df_current_country_data
    else:
        df_3country_covid_data_imputed = pd.concat([df_3country_covid_data_imputed, temp_df_current_country_data],
                                                   ignore_index=True
                                                  )
In [26]:
msno.matrix(df_3country_covid_data_imputed)
Out[26]:
<AxesSubplot:>

Exploratory Data Analysis¶

Delivery 1: The relative cases overtime of Covid infectors (absolute Covid cases/population size)¶

In [27]:
df_3country_covid_data_imputed['infector_percentage'] = df_3country_covid_data_imputed['total_cases']/df_3country_covid_data_imputed['population']
In [28]:
fig = px.line(df_3country_covid_data_imputed, 
              x='date', y='infector_percentage', 
              color='location',
              # title = 'The relative cases overtime of Covid infectors'
             )
fig.update_layout(
    yaxis_title="Total cases as percentage of population",
    xaxis_title="Timeline",
    title = {
        'text':'The total affected rate (percentage of the population) over time',
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend = {
        'title':{'text':'Location'}
    }
)
fig.show(renderer='notebook')

Delivery 2: The vaccination rate (percentage of the population) over time¶

In [29]:
df_3country_covid_data_imputed['vaccination_dose1_percentage'] = df_3country_covid_data_imputed['people_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose2_percentage'] = df_3country_covid_data_imputed['people_fully_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose3_percentage'] = df_3country_covid_data_imputed['total_boosters']/df_3country_covid_data_imputed['population']

df_3country_covid_data_imputed_Transformed = None

for dose in range(1,4):
    
    col_list = ['location', 'date', 'vaccination_dose{}_percentage'.format(dose)]
    temp_df = df_3country_covid_data_imputed[col_list]
    
    temp_df['location'] = temp_df['location']+', '+str(dose)
    temp_df.columns = ['location, dose', 'date', 'vaccination_percentage']
    
    if df_3country_covid_data_imputed_Transformed is None:
        df_3country_covid_data_imputed_Transformed = temp_df
    else:
        df_3country_covid_data_imputed_Transformed = pd.concat([df_3country_covid_data_imputed_Transformed, temp_df],#
                                                               ignore_index=True
                                                              )
In [30]:
fig = None
fig = px.line(df_3country_covid_data_imputed_Transformed, 
              x='date', y='vaccination_percentage', 
              color='location, dose',
             )
fig.update_layout(
    yaxis_title="Percentage of population vaccinated",
    xaxis_title="Timeline",
    title = {
        'text':'The vaccination rate (percentage of the population) over time',
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend = {
        'title':{'text':'Location, Dosage'}
    }
)
fig.show(renderer='notebook')
In [ ]: